Imports and configs¶

In [1]:
import tensorflow as tf
import numpy as np
from collections import deque
import gymnasium as gym
import matplotlib.pyplot as plt
import matplotlib.animation
import time
import pickle
import os

# GPU setup immediately after imports
# Cap TensorFlow's GPU memory use at 4096 MB via a virtual device, instead of
# letting TF reserve the whole card by default.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # NOTE(review): experimental API — newer TF versions expose this as
        # tf.config.set_logical_device_configuration; kept as-is for this TF build.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0], 
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)])
    except RuntimeError as e:
        # Virtual devices must be configured before the GPU is initialized;
        # a late call raises RuntimeError, which we report but tolerate.
        print(e)
2024-12-06 03:52:16.200622: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-06 03:52:17.016282: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-06 03:52:17.395980: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-06 03:52:20.002103: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/usr/lib/mesa-diverted/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/mesa:/usr/lib/x86_64-linux-gnu/dri:/usr/lib/x86_64-linux-gnu/gallium-pipe
2024-12-06 03:52:20.002322: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/usr/lib/mesa-diverted/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu/mesa:/usr/lib/x86_64-linux-gnu/dri:/usr/lib/x86_64-linux-gnu/gallium-pipe
2024-12-06 03:52:20.002346: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.

Handling code/VM crash¶

In [2]:
# Function to save training progress
def save_progress(episode, model, replay_buffer, rewards_history, steps_history, 
                 successful_landings, crashes, rolling_reward_avg, times, best_score, 
                 best_weights, start_time, last_print_time):
    """Checkpoint the full training state to disk.

    Writes the model weights to 'lunar_lander_checkpoint.h5' and every other
    training variable to 'training_progress.pkl'.

    BUGFIX: both files are now written to a temporary name first and then
    atomically renamed with os.replace().  Previously a crash or interrupt in
    the middle of saving (the exact scenario this cell exists to survive)
    could leave a truncated checkpoint that breaks the next resume.
    """
    save_dict = {
        'episode': episode,
        'replay_buffer': list(replay_buffer),  # Convert deque to list for pickling
        'rewards_history': rewards_history,
        'steps_history': steps_history,
        'successful_landings': successful_landings,
        'crashes': crashes,
        'rolling_reward_avg': rolling_reward_avg,
        'times': times,
        'best_score': best_score,
        'best_weights': best_weights,
        'start_time': start_time,
        'last_print_time': last_print_time
    }
    
    # Save the model weights (temp name keeps the .h5 suffix so Keras still
    # picks the HDF5 format), then atomically swap into place.
    model.save_weights('lunar_lander_checkpoint.tmp.h5')
    os.replace('lunar_lander_checkpoint.tmp.h5', 'lunar_lander_checkpoint.h5')
    
    # Save other variables, also via temp-file + atomic rename
    with open('training_progress.pkl.tmp', 'wb') as f:
        pickle.dump(save_dict, f)
    os.replace('training_progress.pkl.tmp', 'training_progress.pkl')
    
    print(f"Progress saved at episode {episode + 1}")

# Check if we're resuming from a checkpoint
# Resume only when BOTH artifacts written by save_progress() are present.
resume_training = os.path.exists('training_progress.pkl') and os.path.exists('lunar_lander_checkpoint.h5')

# Set random seeds for reproducibility
# NOTE(review): the gymnasium env itself is not seeded here (no
# env.reset(seed=...)), so episode terrain/layout may still vary — confirm
# whether full determinism is required.
tf.random.set_seed(42)
np.random.seed(42)

Initializations¶

In [3]:
# Create the environment
# render_mode="rgb_array" makes env.render() return frames as numpy images,
# which the animation cells at the bottom rely on.
env = gym.make("LunarLander-v3", render_mode="rgb_array")

# Report GPU visibility so an accidental CPU-only run is obvious in the output.
print("Num GPUs:", len(tf.config.list_physical_devices('GPU')))
print("GPU Available: ", tf.test.is_built_with_cuda())
print("Devices:", tf.config.list_physical_devices())

# ------------------------ 1. Create a simple DQN epsilon policy network with 4 output neurons (one per possible action).
# [Hint: DQN Agents use Epsilon greedy policy]        [15 points] 
# ------------------------ 2. Discuss the rationale of the activation functions & the loss function used in the network. [10 points]
input_shape = [8]  # LunarLander has 8 observations
n_outputs = 4      # LunarLander has 4 possible actions

# Q-network: maps an 8-dim state to one Q-value per action.
# ReLU hidden activations; the output layer is linear because Q-values are
# unbounded regression targets, not probabilities.
model = tf.keras.Sequential([
    # CONSISTENCY FIX: reuse the input_shape variable instead of a second
    # hard-coded [8] that could silently drift out of sync with it.
    tf.keras.layers.Dense(64, activation="relu", input_shape=input_shape),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(n_outputs)
])

# ------------------------ 3. Define the hyperparameters:
# [50 points]
# --------- (i) the number of iterations, 
batch_size = 64
# --------- (ii) the number of episodes
n_episodes = 1000
# --------- (iii) the maximum number of steps, and  
n_steps = 1000
training_start = 50    # episodes of pure experience collection before training
training_interval = 4  # run a gradient step once every 4 environment steps
# --------- (iv) the discount factor at each step
gamma = 0.99
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# compile() only attaches the optimizer for bookkeeping; no loss is passed
# because training uses a custom GradientTape loop (training_step below)
# with a mean-squared TD-error loss.
model.compile(optimizer=optimizer)

# Initialize or load training variables
# Either restore the complete training state written by save_progress()
# (checkpoint resume) or start fresh.  Both branches define the exact same
# set of variables, so the training loop below is agnostic to which path ran.
if resume_training:
    print("Loading saved progress...")
    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # acceptable only because the checkpoint is produced by this notebook.
    with open('training_progress.pkl', 'rb') as f:
        saved_progress = pickle.load(f)
    
    # Resume one past the last completed episode.
    start_episode = saved_progress['episode'] + 1
    # Rebuild the bounded replay buffer from its pickled list form.
    replay_buffer = deque(saved_progress['replay_buffer'], maxlen=100000)
    rewards_history = saved_progress['rewards_history']
    steps_history = saved_progress['steps_history']
    successful_landings = saved_progress['successful_landings']
    crashes = saved_progress['crashes']
    rolling_reward_avg = saved_progress['rolling_reward_avg']
    times = saved_progress['times']
    best_score = saved_progress['best_score']
    best_weights = saved_progress['best_weights']
    start_time = saved_progress['start_time']
    last_print_time = saved_progress['last_print_time']
    
    model.load_weights('lunar_lander_checkpoint.h5')
    print(f"Resuming training from episode {start_episode}")
else:
    print("Starting new training...")
    start_episode = 0
    replay_buffer = deque(maxlen=100000)  # bounded experience buffer
    rewards_history = []       # total reward per episode
    steps_history = []         # steps taken per episode
    successful_landings = []   # episode indices classified as SUCCESS
    crashes = []               # episode indices classified as CRASH
    rolling_reward_avg = []    # 20-episode moving average of rewards
    times = []                 # wall-clock timestamp every 5 episodes
    best_score = -float('inf')
    best_weights = None        # weights snapshot of the best episode so far
    start_time = time.time()
    last_print_time = start_time

def sample_experiences(batch_size):
    """Draw a random minibatch (with replacement) from the global replay buffer.

    Returns six parallel numpy arrays:
    (states, actions, rewards, next_states, dones, truncateds).
    """
    picks = np.random.randint(len(replay_buffer), size=batch_size)
    sampled = [replay_buffer[i] for i in picks]
    # Transpose the list of transitions into per-field arrays.
    return tuple(np.array(field) for field in zip(*sampled))

def epsilon_greedy_policy(state, epsilon=0):
    """Choose an action: uniformly random with probability epsilon,
    otherwise greedy with respect to the global Q-network's estimates."""
    if np.random.rand() >= epsilon:
        # Exploit: single-sample forward pass, pick the highest-valued action.
        q_estimates = model.predict(state[np.newaxis], verbose=0)[0]
        return np.argmax(q_estimates)
    # Explore: any of the n_outputs discrete actions.
    return np.random.randint(n_outputs)

def play_one_step(env, state, epsilon):
    """Take one epsilon-greedy step in the environment, record the transition
    in the global replay buffer, and return the raw step results."""
    chosen = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, truncated, info = env.step(chosen)
    transition = (state, chosen, reward, next_state, done, truncated)
    replay_buffer.append(transition)
    return next_state, reward, done, truncated, info

def training_step(batch_size, gamma=0.99):
    """Run one DQN gradient step on a sampled minibatch.

    Targets are r + gamma * max_a' Q(s', a'), with the bootstrap term zeroed
    on terminal/truncated steps.  Loss is the mean squared TD error on the
    Q-value of the action actually taken.  NOTE(review): there is no separate
    target network — the same model produces both targets and predictions.
    """
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones, truncateds = experiences
    
    # Cast inputs to float32 so numpy's float64 defaults match the model dtype.
    states = tf.cast(states, tf.float32)
    next_states = tf.cast(next_states, tf.float32)
    rewards = tf.cast(rewards, tf.float32)
    
    # Targets are computed outside the tape, so no gradients flow through
    # the next-state Q-values.
    next_Q_values = model.predict(next_states, verbose=0)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    # runs: 0.0 where the episode ended (done OR truncated), 1.0 otherwise.
    runs = 1.0 - (dones | truncateds)
    target_Q_values = rewards + runs * gamma * max_next_Q_values
    
    # One-hot mask selects each sample's taken-action Q-value from the row.
    mask = tf.one_hot(actions, n_outputs, dtype=tf.float32)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1)
        loss = tf.reduce_mean(tf.square(target_Q_values - Q_values))
    
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    return loss
    
def get_epsilon(episode, n_episodes, epsilon_start=1.0, epsilon_end=0.01):
    """Linearly decayed exploration rate, floored at epsilon_end.

    Generalized: the previously hard-coded start (1.0) and floor (0.01) are
    now keyword parameters with the same defaults, so existing callers are
    unchanged while decay schedules become tunable.

    Args:
        episode: current episode index (0-based).
        n_episodes: episode count over which the decay spans.
        epsilon_start: epsilon at episode 0.
        epsilon_end: minimum epsilon (reached at/after episode n_episodes).

    Returns:
        The epsilon value for this episode.
    """
    return max(epsilon_end, epsilon_start - episode * (epsilon_start - epsilon_end) / n_episodes)
Num GPUs: 1
GPU Available:  True
Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2024-12-06 03:52:24.651468: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-06 03:52:26.795912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4096 MB memory:  -> device: 0, name: NVIDIA L4, pci bus id: 0000:00:03.0, compute capability: 8.9
Loading saved progress...
Resuming training from episode 1000

Training¶

In [4]:
# ------------------------ 4. Train the agent on the LunarLander-v2 or v3 environment for a sufficient number of episodes
# to achieve a satisfactory level of performance. [10 points]
print("\nStarting training...\n")
print("Episode  Steps  Outcome  Avg-Reward(20)  Time(5ep)  Epsilon")
print("-" * 65)

# If resuming training, reconstruct and print the progress table
# Re-derives the per-5-episode log rows from the saved histories so output
# looks continuous across restarts.  `times` holds one wall-clock timestamp
# per 5-episode checkpoint, hence the //5 index arithmetic below.
if resume_training and len(rewards_history) > 0:
    print("\nReconstructing progress table from saved data...\n")
    previous_episodes = len(rewards_history)
    times_len = len(times)

    # Initialize last_print_time for accurate time calculations
    # (extrapolates one interval before the first recorded timestamp).
    if times_len > 0:
        last_print_time = times[0] - (times[1] - times[0]) if times_len > 1 else start_time
    else:
        last_print_time = start_time

    # Calculate the number of complete 5-episode intervals
    num_complete_intervals = previous_episodes // 5

    # Adjust the loop to prevent IndexError
    for idx in range(min(times_len, num_complete_intervals)):
        # Each time corresponds to episode_num = (idx + 1) * 5
        episode_num = (idx + 1) * 5
        episode_idx = episode_num - 1  # zero-based index

        # Check if episode_idx is within the bounds of the data
        if episode_idx >= previous_episodes:
            break  # No more data to process

        # Retrieve data for the episode
        episode_steps = steps_history[episode_idx]
        episode_reward = rewards_history[episode_idx]
        # Fall back to recomputing the 20-episode mean if the rolling-average
        # list is shorter than the reward history.
        avg_reward = rolling_reward_avg[episode_idx] if episode_idx < len(rolling_reward_avg) else np.mean(rewards_history[max(0, episode_idx-19):episode_idx+1])
        epsilon = get_epsilon(episode_idx, n_episodes)
        current_time = times[idx]
        time_for_last_5 = current_time - last_print_time
        last_print_time = current_time
        # NOTE(review): final_reward here duplicates episode_reward — the saved
        # history only stores episode TOTALS, so "success" below means the
        # episode's total reward reached 200.
        final_reward = rewards_history[episode_idx]

        # Determine outcome based on success condition
        if final_reward >= 200:
            outcome = "SUCCESS"
        else:
            outcome = "CRASH"

        log_entry = f"{episode_num:7d}  {episode_steps:5d}  {outcome:7s}  {avg_reward:13.2f}  {time_for_last_5:9.2f}  {epsilon:.3f}"
        print(log_entry)


try:
    for episode in range(start_episode, n_episodes):
        obs, info = env.reset()
        epsilon = get_epsilon(episode, n_episodes)
        episode_reward = 0
        episode_steps = 0
        
        for step in range(n_steps):
            obs, reward, done, truncated, info = play_one_step(env, obs, epsilon)
            episode_reward += reward
            episode_steps += 1
            
            # Train every `training_interval` env steps, once the warm-up
            # episodes have passed and the buffer can fill a batch.
            if episode > training_start and step % training_interval == 0:
                if len(replay_buffer) >= batch_size:
                    training_step(batch_size, gamma)
            
            if done or truncated:
                break
        
        # Update tracking
        rewards_history.append(episode_reward)
        steps_history.append(episode_steps)
        
        # Snapshot the weights of the best-scoring episode for later evaluation.
        if episode_reward > best_score:
            best_score = episode_reward
            best_weights = model.get_weights()
        
        # Track outcomes
        # BUGFIX: success was previously judged on the LAST step's reward
        # (final_reward >= 200), which a single LunarLander step essentially
        # never reaches (the terminal landing bonus is +100), so good episodes
        # were still logged as CRASH.  Judge on the episode's TOTAL reward
        # instead (>= 200 is the standard "solved" threshold), matching the
        # criterion used when reconstructing the table from saved data.
        if episode_reward >= 200:
            successful_landings.append(episode)
            outcome = "SUCCESS"
        else:
            crashes.append(episode)
            outcome = "CRASH"
            
        recent_rewards = rewards_history[-20:] if len(rewards_history) >= 20 else rewards_history
        avg_reward = np.mean(recent_rewards)
        rolling_reward_avg.append(avg_reward)
        
        # Print progress and save checkpoint
        if (episode + 1) % 5 == 0:
            current_time = time.time()
            time_for_last_5 = current_time - last_print_time
            times.append(current_time)
            print(f"{episode+1:7d}  {episode_steps:5d}  {outcome:7s}  {avg_reward:13.2f}  {time_for_last_5:9.2f}  {epsilon:.3f}")
            last_print_time = current_time
            
        if (episode + 1) % 25 == 0:  # Save every 25 episodes
            save_progress(episode, model, replay_buffer, rewards_history, steps_history,
                         successful_landings, crashes, rolling_reward_avg, times,
                         best_score, best_weights, start_time, last_print_time)

except KeyboardInterrupt:
    print("\nTraining interrupted. Saving progress...")
    save_progress(episode, model, replay_buffer, rewards_history, steps_history,
                 successful_landings, crashes, rolling_reward_avg, times,
                 best_score, best_weights, start_time, last_print_time)
    print("Progress saved. You can resume training by running this script again.")
Starting training...

Episode  Steps  Outcome  Avg-Reward(20)  Time(5ep)  Epsilon
-----------------------------------------------------------------

Reconstructing progress table from saved data...

      5     84  CRASH          -266.27       0.21  0.996
     10     63  CRASH          -197.57       0.21  0.991
     15     84  CRASH          -209.90       0.19  0.986
     20     72  CRASH          -181.57       0.42  0.981
     25    105  CRASH          -156.85       0.34  0.976
     30    100  CRASH          -164.56       1.18  0.971
     35    124  CRASH          -131.34       1.19  0.966
     40    101  CRASH          -170.11       0.84  0.961
     45    105  CRASH          -199.42       1.38  0.956
     50     82  CRASH          -210.26       0.64  0.951
     55    104  CRASH          -217.51       5.51  0.947
     60    104  CRASH          -222.67       6.69  0.942
     65     88  CRASH          -185.79       7.96  0.937
     70    121  CRASH          -176.34       9.60  0.932
     75     84  CRASH          -183.65       8.26  0.927
     80     92  CRASH          -169.83      50.85  0.922
     85     80  CRASH          -200.28       8.40  0.917
     90     75  CRASH          -205.75       9.27  0.912
     95    136  CRASH          -217.30      10.10  0.907
    100    144  CRASH          -208.08      12.44  0.902
    105     83  CRASH          -197.87      10.01  0.897
    110     57  CRASH          -188.50      12.06  0.892
    115    139  CRASH          -170.83      10.74  0.887
    120    109  CRASH          -167.26      10.62  0.882
    125    130  CRASH          -168.44       9.55  0.877
    130     79  CRASH          -186.01      10.17  0.872
    135    105  CRASH          -190.65       9.27  0.867
    140    115  CRASH          -185.65       9.99  0.862
    145     75  CRASH          -171.23      11.42  0.857
    150    114  CRASH          -182.24      10.72  0.852
    155     82  CRASH          -194.78      11.84  0.848
    160    100  CRASH          -207.93       8.50  0.843
    165     72  CRASH          -195.95      11.58  0.838
    170     88  CRASH          -162.13      12.28  0.833
    175     81  CRASH          -145.02       9.22  0.828
    180    178  CRASH          -122.24       9.84  0.823
    185    140  CRASH          -139.40      15.75  0.818
    190     92  CRASH          -157.00      15.02  0.813
    195    113  CRASH          -181.77      10.59  0.808
    200    122  CRASH          -204.45      11.01  0.803
    205    101  CRASH          -216.89      18.19  0.798
    210    132  CRASH          -218.43      14.79  0.793
    215    116  CRASH          -198.76      17.81  0.788
    220     87  CRASH          -199.51      15.34  0.783
    225    106  CRASH          -185.98      14.61  0.778
    230     80  CRASH          -173.35      16.83  0.773
    235     74  CRASH          -171.05      14.88  0.768
    240    120  CRASH          -163.63      13.15  0.763
    245    125  CRASH          -181.66      16.26  0.758
    250    134  CRASH          -201.21      15.41  0.753
    255     88  CRASH          -220.77      17.33  0.749
    260     89  CRASH          -208.02      15.16  0.744
    265     81  CRASH          -183.12      14.65  0.739
    270     92  CRASH          -153.28      14.06  0.734
    275    147  CRASH          -128.49      16.12  0.729
    280    331  CRASH          -139.31      14.99  0.724
    285    115  CRASH          -139.56      22.91  0.719
    290    109  CRASH          -135.00      17.19  0.714
    295     94  CRASH          -127.97      41.17  0.709
    300    104  CRASH          -122.85      16.92  0.704
    305    105  CRASH          -119.49      16.97  0.699
    310    153  CRASH          -129.36      16.09  0.694
    315    137  CRASH          -175.33      21.51  0.689
    320    100  CRASH          -178.07      19.25  0.684
    325     87  CRASH          -181.68      23.70  0.679
    330    110  CRASH          -176.85      16.35  0.674
    335    155  CRASH          -136.21      20.49  0.669
    340     77  CRASH          -133.84      31.01  0.664
    345    107  CRASH          -128.55      22.94  0.659
    350    183  CRASH          -132.57      18.64  0.654
    355    183  CRASH          -122.89      22.72  0.650
    360    220  CRASH          -115.11      26.69  0.645
    365    114  CRASH          -106.84      26.51  0.640
    370    144  CRASH           -96.20      19.74  0.635
    375    172  CRASH           -94.91      29.18  0.630
    380    178  CRASH           -88.17      24.64  0.625
    385    105  CRASH           -98.76      23.98  0.620
    390    169  CRASH           -96.22      19.58  0.615
    395    156  CRASH           -93.03      20.82  0.610
    400    210  CRASH           -94.48      25.08  0.605
    405    109  CRASH           -83.73      27.28  0.600
    410    144  CRASH           -83.68      22.78  0.595
    415    262  CRASH          -103.65      24.92  0.590
    420    188  CRASH          -101.29      33.58  0.585
    425     85  CRASH           -97.88      24.35  0.580
    430    111  CRASH          -101.54      31.17  0.575
    435    324  CRASH          -101.07      58.44  0.570
    440    239  CRASH          -103.86      31.32  0.565
    445    143  CRASH           -92.38      27.62  0.560
    450    147  CRASH           -99.69      26.53  0.555
    455    179  CRASH           -84.90      30.22  0.551
    460     90  CRASH           -86.56      35.42  0.546
    465    201  CRASH          -104.66      27.57  0.541
    470    116  CRASH          -100.21      24.71  0.536
    475    270  CRASH          -103.19      24.39  0.531
    480    226  CRASH          -106.53    1639.60  0.526
    485    163  CRASH          -101.41      51.75  0.521
    490    309  CRASH          -119.64      38.55  0.516
    495    401  CRASH          -143.84      72.81  0.511
    500    229  CRASH          -142.94      50.19  0.506
    505    268  CRASH          -151.54      46.77  0.501
    510    210  CRASH          -144.94      47.21  0.496
    515    429  CRASH          -136.41      47.14  0.491
    520    359  CRASH          -158.75      69.06  0.486
    525    233  CRASH          -145.00      49.84  0.481
    530    396  CRASH          -166.57      87.87  0.476
    535    256  CRASH          -167.56      81.22  0.471
    540    303  CRASH          -158.09      57.13  0.466
    545    101  CRASH          -164.81      51.04  0.461
    550    383  CRASH          -142.63      44.32  0.456
    555     74  CRASH          -128.23      89.75  0.452
    560    112  CRASH          -112.33      66.44  0.447
    565    315  CRASH          -107.91      34.88  0.442
    570    231  CRASH           -95.05      45.53  0.437
    575    312  CRASH           -90.60      60.01  0.432
    580    141  CRASH           -92.03      60.96  0.427
    585    329  CRASH           -93.57      45.33  0.422
    590    363  CRASH           -90.46      67.75  0.417
    595    290  CRASH           -88.30      69.82  0.412
    600    163  CRASH           -80.28      58.01  0.407
    605    205  CRASH           -82.51      87.83  0.402
    610    314  CRASH           -87.75      64.39  0.397
    615    293  CRASH           -82.16     132.34  0.392
    620    695  CRASH           -84.02      90.38  0.387
    625    247  CRASH           -68.75      83.06  0.382
    630    263  CRASH           -49.70      84.46  0.377
    635    298  CRASH           -40.37     101.48  0.372
    640    370  CRASH           -44.19      88.37  0.367
    645    166  CRASH           -36.42     150.09  0.362
    650   1000  CRASH           -39.45      70.69  0.357
    655    289  CRASH           -50.90    1287.99  0.353
    660    522  CRASH           -34.48     149.06  0.348
    665    140  CRASH           -43.85     125.57  0.343
    670    884  CRASH           -61.39      67.85  0.338
    675    773  CRASH           -59.79     201.29  0.333
    680   1000  CRASH           -60.74     212.71  0.328
    685    577  CRASH           -57.21     215.41  0.323
    690    129  CRASH           -39.11      98.30  0.318
    695   1000  CRASH           -45.10     145.89  0.313
    700    669  CRASH           -53.00     145.18  0.308
    705   1000  CRASH           -55.90     255.22  0.303
    710   1000  CRASH           -56.61     274.75  0.298
    715   1000  CRASH           -42.90     218.48  0.293
    720   1000  CRASH           -40.63     267.23  0.288
    725   1000  CRASH           -39.28     233.35  0.283
    730   1000  CRASH           -48.44    2269.60  0.278
    735    592  CRASH           -52.45     276.45  0.273
    740   1000  CRASH           -41.03     251.22  0.268
    745   1000  CRASH           -40.71    2102.27  0.263
    750   1000  CRASH           -18.74     257.51  0.258
    755    767  CRASH           -11.29     280.59  0.254
    760   1000  CRASH           -11.53     240.87  0.249
    765   1000  CRASH            -0.45     279.94  0.244
    770   1000  CRASH            -7.39     268.41  0.239
    775    852  CRASH            -4.98     238.69  0.234
    780   1000  CRASH            -8.23    2077.79  0.229
    785   1000  CRASH           -12.16     289.84  0.224
    790   1000  CRASH            -9.23     247.55  0.219
    795   1000  CRASH            -6.98     288.94  0.214
    800    352  CRASH           -10.61     283.22  0.209
    805    929  CRASH            -0.89     204.79  0.204
    810    302  CRASH            -5.22     238.03  0.199
    815   1000  CRASH            -3.31     224.14  0.194
    820   1000  CRASH            19.97     265.62  0.189
    825   1000  CRASH            10.21     275.53  0.184
    830    467  SUCCESS          43.93    3073.26  0.179
    835    771  CRASH            50.69     170.22  0.174
    840   1000  CRASH            15.34     248.30  0.169
    845   1000  CRASH             0.84     193.13  0.164
    850   1000  CRASH           -45.64     259.73  0.159
    855    172  CRASH           -76.35     258.47  0.155
    860   1000  CRASH           -67.14     258.48  0.150
    865   1000  CRASH           -54.51     301.89  0.145
    870   1000  CRASH           -54.38     215.18  0.140
    875    823  CRASH           -48.69     266.22  0.135
    880   1000  CRASH           -53.05    1156.19  0.130
    885    176  CRASH           -67.24     312.10  0.125
    890   1000  CRASH           -52.93     262.44  0.120
    895    234  CRASH           -62.09     232.10  0.115
    900    295  CRASH           -61.73     166.67  0.110
    905    301  CRASH           -63.73     219.96  0.105
    910   1000  CRASH           -80.45     228.56  0.100
    915   1000  CRASH           -68.07     319.72  0.095
    920    248  CRASH           -55.86     275.78  0.090
    925   1000  CRASH           -32.34     269.67  0.085
    930   1000  CRASH             9.40     552.71  0.080
    935    181  CRASH            42.35     307.61  0.075
    940   1000  CRASH            35.32     200.61  0.070
    945   1000  CRASH            43.81     331.32  0.065
    950   1000  CRASH            25.92     322.58  0.060
    955   1000  CRASH            -2.59    3076.69  0.056
    960   1000  CRASH            -0.62     452.48  0.051
    965    681  SUCCESS           3.85     380.84  0.046
    970   1000  CRASH            13.88     433.52  0.041
    975    958  CRASH            21.40     362.63  0.036
    980   1000  CRASH            40.59    1629.61  0.031
    985   1000  CRASH            30.60     325.45  0.026
    990   1000  CRASH            26.49     320.70  0.021
    995    870  CRASH            33.18     272.47  0.016
   1000   1000  CRASH            14.84     327.51  0.011

Analysis¶

In [5]:
# ------------------------ 5. Analyze the agent's learning progress by plotting relevant performance metrics
# (e.g., cumulative rewards, episode length) over time. [10 points]
# Restore best weights
# Evaluate with the best-scoring episode's snapshot rather than whatever the
# final-episode weights happened to be.
model.set_weights(best_weights)

# Final Analysis
def calculate_times_with_interpolation(timestamps, spike_threshold=1000):
    """Convert absolute checkpoint timestamps into per-interval durations,
    smoothing out implausible spikes (e.g. the VM being suspended mid-run).

    The zeroth interval has no predecessor, so it is assumed equal to the
    first real difference.  Any duration above `spike_threshold` seconds
    (previously a hard-coded 1000) is replaced by the mean of its neighbours,
    treating a missing neighbour as 0.

    Args:
        timestamps: list of absolute times, one per checkpoint.
        spike_threshold: durations above this are considered outliers.

    Returns:
        A list of durations the same length as `timestamps`.  Returns [] for
        fewer than two timestamps — previously that case raised IndexError.
    """
    if len(timestamps) < 2:
        return []

    # Calculate the time differences (first one duplicated from the second)
    times = [timestamps[1] - timestamps[0]]
    times += [timestamps[i] - timestamps[i - 1] for i in range(1, len(timestamps))]

    # Replace outlier durations with the average of adjacent values
    for i in range(len(times)):
        if times[i] > spike_threshold:
            prev_time = times[i - 1] if i > 0 else 0
            next_time = times[i + 1] if i + 1 < len(times) else 0
            times[i] = (prev_time + next_time) / 2

    return times
    
# Calculate corrected times
corrected_times = calculate_times_with_interpolation(times)

import datetime

# Human-readable total wall-clock training time (outlier intervals already
# smoothed by the interpolation above).
total_seconds = int(np.sum(corrected_times))
total_time = str(datetime.timedelta(seconds=total_seconds))

print("\nTraining Summary:")
print(f"Total training time: {total_time}")
print(f"Episodes hitting max steps: {sum(1 for s in steps_history if s >= 999)}/{n_episodes}")
print(f"Successful landings: {len(successful_landings)}/{n_episodes} ({len(successful_landings)/n_episodes*100:.1f}%)")
print(f"Crashes: {len(crashes)}/{n_episodes} ({len(crashes)/n_episodes*100:.1f}%)")
print(f"Best score achieved: {best_score:.2f}")

# Plot training results
# Three stacked panels: reward curve, per-episode outcome scatter, and
# per-checkpoint interval durations.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 10))

# Font size configuration
font_title = 14
font_label = 12
font_tick = 10
font_legend = 11

# Panel 1: raw episode rewards plus the 20-episode moving average.
ax1.plot(rewards_history, alpha=0.6, label='Episode Reward')
ax1.plot(rolling_reward_avg, label='20-Episode Average', linewidth=2)
ax1.set_title('Training Progress - Rewards', fontsize=font_title)
ax1.set_xlabel('Episode', fontsize=font_label)
ax1.set_ylabel('Total Reward', fontsize=font_label)
ax1.tick_params(axis='both', labelsize=font_tick)
ax1.grid(True)
ax1.legend(fontsize=font_legend)

# Panel 2: binary outcome per episode (success at y=1, crash at y=0).
episodes = range(1, len(rewards_history) + 1)
ax2.scatter(successful_landings, [1]*len(successful_landings), color='green', label='Successful Landings', alpha=0.6)
ax2.scatter(crashes, [0]*len(crashes), color='red', label='Crashes', alpha=0.6)
ax2.set_title('Landing Outcomes', fontsize=font_title)
ax2.set_xlabel('Episode', fontsize=font_label)
ax2.set_yticks([0, 1])
ax2.set_yticklabels(['Crash', 'Success'], fontsize=font_tick)
ax2.tick_params(axis='both', labelsize=font_tick)
ax2.grid(True)
ax2.legend(fontsize=font_legend)

# Panel 3: corrected checkpoint-interval durations.
# Correct x-axis to match every 5 episodes, ending at 1000
# (times were recorded once per 5 episodes during training).
x_axis_corrected = [i * 5 for i in range(len(corrected_times))]

ax3.plot(x_axis_corrected, corrected_times, color='blue', linewidth=2, label='Corrected Times')
ax3.set_title('Episode Times', fontsize=font_title)
ax3.set_xlabel('Episode', fontsize=font_label)  # Updated label to "Episode"
ax3.set_ylabel('Time (s)', fontsize=font_label)
ax3.tick_params(axis='both', labelsize=font_tick)
ax3.grid(True)
ax3.legend(fontsize=font_legend)

plt.tight_layout()
plt.show()
Training Summary:
Total training time: 6:12:25
Episodes hitting max steps: 229/1000
Successful landings: 15/1000 (1.5%)
Crashes: 985/1000 (98.5%)
Best score achieved: 312.91
No description has been provided for this image

Animations¶

In [6]:
def plot_animation(frames, repeat=False, interval=40):
    """Render a list of RGB frames as an inline Jupyter JS animation.

    Returns an IPython HTML object — returning it as the cell's last
    expression is what makes Jupyter display the animation.
    """
    plt.figure()
    image = plt.imshow(frames[0])
    plt.axis('off')

    def _draw(frame_idx):
        # Mutate the existing image artist rather than re-plotting each frame.
        image.set_data(frames[frame_idx])

    anim = matplotlib.animation.FuncAnimation(
        plt.gcf(), _draw,
        frames=len(frames), repeat=repeat, interval=interval)
    from IPython.display import HTML
    return HTML(anim.to_jshtml())  # This line is key for Jupyter display

def show_video(env, model, max_steps=2000):
    """Run one fully greedy (epsilon=0) episode, collect rendered frames,
    print the outcome, and return the inline animation.

    NOTE(review): the `model` parameter is unused here — action selection goes
    through epsilon_greedy_policy, which reads the global model; kept in the
    signature for backward compatibility.
    """
    frames = []
    obs, _ = env.reset()
    total_reward = 0
    total_steps = 0
    
    for step in range(max_steps):
        frames.append(env.render())
        action = epsilon_greedy_policy(obs, epsilon=0)
        obs, reward, done, truncated, _ = env.step(action)
        total_reward += reward
        total_steps += 1
        if done or truncated:
            # BUGFIX: success was judged on the final STEP's reward, which a
            # single LunarLander step essentially never brings to 200 (the
            # terminal landing bonus is +100).  Judge on the episode total,
            # consistent with the >= 200 criterion used in the analysis cells.
            print("Landing status:", "Success!" if total_reward >= 200 else "Crash!")
            print(f"Total reward: {total_reward:.2f}")
            print(f"Total steps: {total_steps}")
            break
    
    env.close()
    return plot_animation(frames)
    
show_video(env, model)
2024-12-06 03:52:31.448371: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
Landing status: Crash!
Total reward: -60.75
Total steps: 1000
Out[6]:
No description has been provided for this image
No description has been provided for this image
In [7]:
# If using Spyder

# def show_video(env, model, max_steps=1000):
#     frames = []
#     obs, _ = env.reset()
#     total_reward = 0
#     total_steps = 0
    
#     for step in range(max_steps):
#         frames.append(env.render())
#         action = epsilon_greedy_policy(obs, epsilon=0)
#         obs, reward, done, truncated, _ = env.step(action)
#         total_reward += reward
#         total_steps += 1
#         if done or truncated:
#             print("Landing status:", "Success!" if reward >= 200 else "Crash!")
#             print(f"Total reward: {total_reward:.2f}")
#             print(f"Total steps: {total_steps}")
#             break
    
#     plt.figure(figsize=(8, 8))
#     for frame in frames:
#         plt.clf()
#         plt.imshow(frame)
#         plt.axis('off')
#         plt.pause(0.05)
    
#     plt.show()

# Show the final performance
# show_video(env, model)
# env.close()
In [ ]: